Data cleaning

# Party names that are kept verbatim when recoding parties: a row whose
# `party` value is exactly one of these (and that matched none of the earlier
# pattern rules) keeps its own name in `party_recoded`; everything else that
# is unmatched becomes "OTHER".
important_parties <- c(
  "PARTIDO SOCIALISTA OBRERO ESPAÑOL",
  "PARTIDO POPULAR",
  "CIUDADANOS",
  "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO",
  "BLOQUE NACIONALISTA GALEGO",
  "CONVERGÈNCIA I UNIÓ",
  "UNIDAS PODEMOS - IU",
  "ESQUERRA REPUBLICANA DE CATALUNYA",
  "EH - BILDU",
  "MÁS PAÍS",
  "VOX"
)

# Build the long-format election table: one row per original row of
# `election_data` and party column, with a harmonised party name
# (`party_recoded`), an election date, a municipality key (`cod_mun`) and the
# party abbreviation (`siglas`).
election_data_tidy <- election_data |> 
  # Every column after the first 15 is treated as a party column and stacked
  # into `party` (column name) / `votes` (cell value).
  pivot_longer(cols = -(1:15), names_to = "party", values_to = "votes") |> 
  mutate(
    # Rules are tried top to bottom and the first match wins, so their order
    # matters (e.g. the "PARTIDO POPULAR" pattern is only reached by names that
    # matched none of the rules above it).
    party_recoded = case_when(
      str_detect(party, "PARTIDO SOCIALISTA OBRERO ESPAÑOL|PARTIT DELS SOCIALISTES DE CATALUNYA|PARTIDO SOCIALISTA DE EUSKADI|PARTIDO DOS SOCIALISTAS DE GALICIA") ~ "PARTIDO SOCIALISTA OBRERO ESPAÑOL",
      str_detect(party, "PARTIDO DE LA CIUDADANIA|PARTIDO DE LA CIUDADANÍA") ~ "CIUDADANOS-PARTIDO DE LA CIUDADANIA",
      str_detect(party, "EH - BILDU|ARALAR|ALTERNATIBA|EUSKO ALKARTASUNA") ~ "EUSKAL HERRIA BILDU",
      str_detect(party, "UNIDAS PODEMOS|EN MAREA|PODEM|EZKER BATUA|IZQUIERDA UNIDA|ESQUERRA UNIDA|ESQUERDA UNIDA") ~ "PODEMOS",
      str_detect(party, "CONVERGÈNCIA I UNIÓ|CONVERGENCIA I UNIO|DEMOCRÀCIA I LLIBERTAT|CONVERGÈNCIA i UNIÓ ") ~ "CONVERGENCIA I UNIO",
      str_detect(party, "BLOQUE NACIONALISTA GALEGO|CANDIDATURA GALEGA") ~ "BLOQUE NACIONALISTA GALEGO",
      str_detect(party, "PARTIDO POPULAR") ~ "PARTIDO POPULAR",
      str_detect(party, "MÁS PAÍS") ~ "MÁS PAÍS",
      str_detect(party, "ESQUERRA REPUBLICANA DE CATALUNYA|ESQUERRA REPUBLICANA/CATALUNYA") ~ "ESQUERRA REPUBLICANA DE CATALUNYA",
      # Exact-name fallback for the remaining parties listed in `important_parties`
      party %in% important_parties ~ party,
      TRUE ~ "OTHER"
    ),
    # Election date set to the first day of the election month
    date = glue("{anno}-{mes}-01") |> as_date()
  ) |> 
  # Municipality key "ccaa-province-municipality"; the three source columns are kept
  unite("cod_mun", codigo_ccaa, codigo_provincia, codigo_municipio, sep = "-", remove = FALSE) |> 
  # Attach party abbreviations: one row of `abbrev` per party name, with a few
  # abbreviations rewritten ("C's" -> "CS", "EH Bildu" -> "EH BILDU", "M PAÍS" -> "MP")
  left_join(
    abbrev |> 
      distinct(denominacion, .keep_all = TRUE) |> 
      mutate(siglas = case_when(
        siglas == "C's" ~ "CS",
        siglas == "EH Bildu" ~ "EH BILDU",
        siglas == "M PAÍS" ~ "MP",
        TRUE ~ siglas
      )), 
    by = c("party_recoded" = "denominacion")
  ) |> 
  # Join the `cod_mun` lookup table on the municipality key
  # NOTE(review): presumably this supplies the `municipio` name used later — confirm
  left_join(cod_mun, by = "cod_mun") |> 
  select(-vuelta, -tipo_eleccion, -codigo_distrito_electoral) |> 
  # Drop party/municipality combinations without a vote count
  drop_na(votes) |> 
  mutate(
    # Parties without a matching abbreviation are labelled "OTHER"
    siglas = if_else(is.na(siglas),"OTHER", siglas)
  )

# Long-format polls (one row per survey and party column), keeping only
# surveys for elections from 2008 onwards that are not exit polls, have a
# sample size of at least 750 and a fieldwork span of at least 1.
surveys_tidy <- surveys |>
  pivot_longer(cols = -(1:10), names_to = "party", values_to = "estimation") |>
  filter(year(date_elec) >= 2008) |>
  filter(exit_poll == FALSE) |>
  filter(size >= 750) |>
  filter(field_date_to - field_date_from >= 1) |>
  select(!type_survey) |>
  drop_na(size)

Question 1

Which party was the winner in the municipalities with more than 100,000 inhabitants (census) in each of the elections?

# Rows belonging to municipalities whose census exceeds 100,000
large_municipalities <- filter(election_data_tidy, censo > 100000)

# Most-voted row per election date and municipality (exactly one row per
# group, even when there is a tie)
winners <- large_municipalities |>
  group_by(date, municipio) |>
  slice_max(order_by = votes, n = 1, with_ties = FALSE) |>
  select(date, party_recoded, municipio, censo)

# How many of those municipalities each party won in each election,
# largest count first within a date
winners_by_party <- winners |>
  ungroup() |>
  count(date, party_recoded, name = "num_municipalities") |>
  arrange(date, desc(num_municipalities))

# Register the Google font "Roboto Condensed" under the family name "Roboto"
font_add_google("Roboto Condensed", "Roboto")

# Treat the election date as a discrete axis: one factor level per election,
# in order of first appearance
winners$date <- factor(winners$date, levels = unique(winners$date))

# Tile chart: one tile per municipality (y) and election (x), filled with the
# colour of the winning party.
gmun <- ggplot(winners, aes(x = date, y = municipio, fill = party_recoded)) +
  geom_tile(color = "white") +
  scale_fill_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505",
      "OTHER" = "gray60",
      "PODEMOS" = "#a444b4",
      "VOX" = "#83b431",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "darkgreen",
      "CONVERGENCIA I UNIO" = "#1b348a",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "#ffbf41"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
      "OTHER" = "OTHER",
      "PODEMOS" = "PODEMOS",
      "VOX" = "VOX",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
      "CONVERGENCIA I UNIO" = "CiU",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "ERC"
    )
  ) +
  labs(
    title = "Winning party in municipalities with more than 100,000 habitants",
    x = "Date of election",
    y = "Municipality",
    fill = "Parties"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5, family = "Roboto", margin = margin(b = 20)),
    axis.text.x = element_text(size = 11, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 11, family = "Roboto", color = "black"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` for border widths, matching
    # the usage elsewhere in this file
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )
# One row per general election: the election date and the party recorded as
# its winner.
general_election_winners <- data.frame(
  date = as.Date(c(
    "2008-03-01", "2011-11-01", "2015-12-01",
    "2016-06-01", "2019-04-01", "2019-11-01"
  )),
  party_recoded = c(
    "PARTIDO SOCIALISTA OBRERO ESPAÑOL",
    rep("PARTIDO POPULAR", 3),
    rep("PARTIDO SOCIALISTA OBRERO ESPAÑOL", 2)
  )
)

# Each period ends on the date of the next election; the last election has no
# successor, so its end date equals its own date.
general_election_winners$end_date <- with(
  general_election_winners,
  c(date[-1], date[length(date)])
)

# Line chart of the number of large municipalities won by each party in each
# election. The shaded background spans the period from one general election
# to the next and takes the colour of the party listed as that election's
# winner in `general_election_winners`.
gwin <- ggplot(winners_by_party, aes(x = date, y = num_municipalities, color = party_recoded)) +
  geom_rect(
    data = general_election_winners,
    aes(xmin = date, xmax = end_date, ymin = -Inf, ymax = Inf, fill = party_recoded),
    alpha = 0.1, inherit.aes = FALSE
  ) +
  # `linewidth` replaces the deprecated `size` for line widths, matching the
  # usage elsewhere in this file
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  scale_color_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505",
      "OTHER" = "gray60",
      "PODEMOS" = "#a444b4",
      "VOX" = "#83b431",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "darkgreen",
      "CONVERGENCIA I UNIO" = "#1b348a",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "#ffbf41"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
      "OTHER" = "Others",
      "PODEMOS" = "Podemos",
      "VOX" = "Vox",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
      "CONVERGENCIA I UNIO" = "CiU",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "ERC"
    )
  ) +
  scale_fill_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE"
    )
  ) +
  # Dashed vertical marker at every general election date
  geom_vline(
    data = general_election_winners, aes(xintercept = as.numeric(date)),
    color = "gray50", linetype = "dashed", linewidth = 0.4
  ) +
  labs(
    title = "Evolution of winning party in municipalities with more than 100,000 habitants",
    x = "Date of Election",
    y = "Number of Municipalities",
    color = "Winner in each municipality",
    fill = "General Election Winner"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", family = "Roboto", margin = margin(b = 20)),
    axis.text.x = element_text(size = 11, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 11, family = "Roboto", color = "black"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

Question 2

Which party came second when the PSOE came first? And when the PP came first?

# Rank every party row by votes within each election and municipality
# (rank 1 = most votes). Municipalities are identified by their code
# (`cod_mun`) rather than by name, because a name can be shared by more than
# one municipality and grouping by name would merge them into a single
# ranking.
ranked_votes <- election_data_tidy |>
  group_by(date, cod_mun) |>
  arrange(desc(votes)) |>
  mutate(rank = row_number()) |>
  ungroup()

# Rows where `winner_name` ranked first, with the runner-up of the same
# election and municipality attached as `second` (NA when the municipality has
# no second-ranked row). Both sides carry a `votes` column, so the join
# suffixes them: `votes.x` is the winner's votes, `votes.y` the runner-up's.
runner_up_of <- function(ranked, winner_name) {
  runner_ups <- ranked |>
    filter(rank == 2) |>
    select(date, cod_mun, second = party_recoded, votes)

  ranked |>
    filter(rank == 1, party_recoded == winner_name) |>
    left_join(runner_ups, by = c("date", "cod_mun"))
}

# Number of municipalities per election date and runner-up party, largest
# count first within each date.
count_runner_ups <- function(first_and_second) {
  first_and_second |>
    group_by(date, second) |>
    summarize(num_municipalities = n(), .groups = "drop") |>
    arrange(date, desc(num_municipalities))
}

# PSOE is first
second_psoe <- runner_up_of(ranked_votes, "PARTIDO SOCIALISTA OBRERO ESPAÑOL")
second_psoe_sum <- count_runner_ups(second_psoe)

# PP is first
second_pp <- runner_up_of(ranked_votes, "PARTIDO POPULAR")
second_pp_sum <- count_runner_ups(second_pp)

# Stack the PP-first and PSOE-first summaries, tagging each with the winner
second_combined <- bind_rows(
  mutate(second_pp_sum, first = "PP"),
  mutate(second_psoe_sum, first = "PSOE")
)

# Election date as a factor, one level per date in order of first appearance
second_combined <- mutate(
  second_combined,
  date = factor(date, levels = unique(date))
)

# Stacked bar chart, one facet per winning party (PP / PSOE): for every
# election, the share of municipalities in which each party came second.
# `position = "fill"` rescales every bar to 1, so the y axis shows shares,
# not counts — the axis label says so.
gsec <- ggplot(second_combined, aes(x = date, y = num_municipalities, fill = second)) +
  geom_col(position = "fill", color = "black") +
  scale_fill_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505",
      "OTHER" = "gray60",
      "PODEMOS" = "#a444b4",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "darkgreen",
      "BLOQUE NACIONALISTA GALEGO" = "lightblue",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "orange",
      "VOX" = "#83b431",
      "CONVERGENCIA I UNIO" = "#1b348a",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "yellow"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
      "OTHER" = "Others",
      "PODEMOS" = "Podemos",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
      "BLOQUE NACIONALISTA GALEGO" = "BNG",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "C's",
      "CONVERGENCIA I UNIO" = "CiU",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "ERC"
    )
  ) +
  labs(
    title = "Second places when PSOE or PP were first",
    x = "Election Date",
    y = "Share of Municipalities",
    fill = "Second Party"
  ) +
  facet_wrap(~ first, scales = "free_y", labeller = labeller(first = c(PP = "PP First", PSOE = "PSOE First"))) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 14, face = "bold", family = "Roboto"),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5, family = "Roboto", margin = margin(b = 20)),
    axis.text.x = element_text(size = 11, family = "Roboto", color = "black", angle = 20),
    axis.text.y = element_text(size = 11, family = "Roboto", color = "black"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` for border widths
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )
# Replace full party names by short labels in both the winner (`first`) and
# runner-up (`second`) columns; names without an entry are left as they are.
# Afterwards drop the rows whose runner-up is PNV, BNG or ERC, as well as the
# rows without a runner-up.
second_combined <- second_combined |>
  mutate(
    across(
      c(first, second),
      \(party_name) recode(
        party_name,
        "PARTIDO POPULAR" = "PP",
        "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
        "OTHER" = "Others",
        "PODEMOS" = "Podemos",
        "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
        "BLOQUE NACIONALISTA GALEGO" = "BNG",
        "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "C's",
        "CONVERGENCIA I UNIO" = "CiU",
        "ESQUERRA REPUBLICANA DE CATALUNYA" = "ERC"
      )
    )
  ) |>
  filter(!is.na(second), !second %in% c("PNV", "BNG", "ERC"))
  
# Alluvial diagram: flows from the winning party (first axis) to the party
# that came second (second axis), with flow height given by the number of
# municipalities.
gsec2 <- ggplot(second_combined, aes(
  axis1 = first, axis2 = second,
  y = num_municipalities, fill = second
)) +
  geom_alluvium(aes(fill = second), width = 1/6) +
  # Strata boxes are filled by their own party and labelled with its name
  geom_stratum(aes(fill = after_stat(stratum)), width = 1/6, color = "black") +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3.5, color = "black", fontface = "bold") +
  scale_fill_manual(
    values = c(
      "PP" = "#1db4e8",
      "PSOE" = "#c30505",
      "Others" = "gray60",
      "Podemos" = "#a444b4",
      "PNV" = "darkgreen",
      "BNG" = "lightblue",
      "C's" = "orange",
      "VOX" = "#83b431",
      "CiU" = "#1b348a",
      "ERC" = "yellow"
    )
  ) +
  labs(
    title = "Flow of municipalities won: First to second party",
    x = "First to Second Party",
    y = "Number of Municipalities",
    fill = "Second Party"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 17, face = "bold", hjust = 0.5, family = "Roboto", margin = margin(b = 20)),
    axis.text.x = element_text(size = 12, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 12, family = "Roboto", color = "black"),
    axis.title.x = element_text(size = 13),
    axis.title.y = element_text(size = 13),
    legend.title = element_text(size = 13, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 11, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` for border widths
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

Question 3

Who benefits from low turnout?

# Add turnout and party vote share to every row.
election_data_tidy <- election_data_tidy |>
  # Row-level quantities: ballots cast (blank + null + candidature votes) and
  # turnout relative to the census
  mutate(
    total_votes = votos_blancos + votos_nulos + votos_candidaturas,
    turnout = total_votes / censo
  ) |>
  # Votes of all lists recoded to the same party, per municipality and
  # election, and their share of the ballots cast
  group_by(cod_mun, date, party_recoded) |>
  mutate(
    votes_recoded = sum(votes, na.rm = TRUE),
    vote_share_by_party = votes_recoded / total_votes
  ) |>
  ungroup()

# Linear model of party vote share on turnout, the party, and their
# interaction; the interaction terms describe how the turnout slope differs
# by party.
model_all <- lm(
  vote_share_by_party ~ turnout * party_recoded,
  data = election_data_tidy
)

model_summary <- summary(model_all)

# Coefficient table of the fitted model
model_coefficients <- coef(model_summary)

# Keep only the interaction rows, i.e. those whose name contains
# "turnout:party_recoded" (always returned as a matrix, even for one row)
party_coeffs <- model_coefficients[
  grepl("turnout:party_recoded", rownames(model_coefficients), fixed = TRUE),
  ,
  drop = FALSE
]

# Unique (party name, abbreviation) pairs, used to translate coefficient
# names into abbreviations
party_mapping <- distinct(election_data_tidy, party_recoded, siglas)

# Party name behind each interaction coefficient: strip the
# "turnout:party_recoded" prefix and any surrounding whitespace
party_names <- trimws(
  gsub("turnout:party_recoded", "", rownames(party_coeffs), fixed = TRUE)
)

# Two-colour palette keyed by the sign of the coefficient
color_pal <- c(
  "Negative" = "#980043",
  "Positive" = "#9e9ac8"
)

# One row per interaction coefficient: party abbreviation, estimate, and the
# size / category information used for the word cloud
party_coeff_df <- data.frame(
  party = party_names,
  coefficient = party_coeffs[, "Estimate"]
) |>
  left_join(party_mapping, by = c("party" = "party_recoded")) |>
  select(party_abbrev = siglas, coefficient) |>
  mutate(
    size = abs(coefficient), # magnitude of the effect drives the word size
    category = ifelse(coefficient < 0, "Negative", "Positive")
  )

# Colour of each party, looked up from its category
party_coeff_df <- party_coeff_df |>
  mutate(color = color_pal[category])

Question 4

How to analyze the relationship between census and vote?

# One fitted straight line (linear model, no confidence band) per party
# abbreviation: vote share as a function of the census size.
cen_vote <- ggplot(election_data_tidy, aes(x = censo, y = vote_share_by_party, colour = siglas)) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Party-Specific Trends: Vote Share vs Census",
    x = "Census",
    y = "Vote Share",
    colour = "Party"
  ) +
  scale_colour_manual(values = c(
    "PP" = "#1db4e8",
    "PSOE" = "#c30505",
    "OTHER" = "gray60",
    "PODEMOS" = "#a444b4",
    "VOX" = "#83b431",
    "ERC" = "#ffbf41",
    "CIU" = "#1b348a",
    "MP" = "#004938",
    "CS" = "#eb6109",
    "EAJ-PNV" = "darkgreen",
    "BNG" = "lightblue",
    "EH BILDU" = "#03cfb4"
  )) +
  # Base theme applied once (it was previously added twice)
  theme_minimal() +
  theme(
    plot.title = element_text(size = 17, face = "bold", hjust = 0.5, family = "Roboto", margin = margin(b = 20)),
    axis.text.x = element_text(size = 12, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 12, family = "Roboto", color = "black"),
    axis.title.x = element_text(size = 13),
    axis.title.y = element_text(size = 13),
    legend.title = element_text(size = 13, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 11, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` for border widths
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

Is it true that certain parties win in rural areas?

(Gómez Valenzuela and Holl 2023)

  • Rural: fewer than 10,000 citizens recorded in the census
  • Urban: 10,000 or more citizens recorded in the census
# For the second part - rural vs urban

# Classify every row by the size of its municipality: census below 10,000 is
# "Rural", anything else "Urban"
election_data_tidy <- election_data_tidy |>
  mutate(
    area_type = ifelse(censo < 10000, "Rural", "Urban")
  )

rural_municipalities <- election_data_tidy |>
  filter(area_type == "Rural")

# Winning party per election and rural municipality. Municipalities are
# identified by `cod_mun` (the key `votes_recoded` was computed on), not by
# name, so that municipalities sharing a name are not merged.
winners_rural <- rural_municipalities |>
  group_by(date, cod_mun) |>
  slice_max(votes_recoded, n = 1, with_ties = FALSE) |>
  ungroup() |>
  select(date, party_recoded, municipio, censo)

# Number of rural municipalities won by each party in each election.
# This must start from `winners_rural`; it previously started from `winners`,
# which holds the municipalities above 100,000 inhabitants.
winners_by_party_rural <- winners_rural |>
  group_by(date, party_recoded) |>
  summarize(num_municipalities = n(), .groups = "drop") |>
  arrange(date, desc(num_municipalities))

urban_municipalities <- election_data_tidy |>
  filter(area_type == "Urban")

# Winning party per election and urban municipality
winners_urban <- urban_municipalities |>
  group_by(date, cod_mun) |>
  slice_max(votes_recoded, n = 1, with_ties = FALSE) |>
  ungroup() |>
  select(date, party_recoded, municipio, censo)

# Number of urban municipalities won by each party in each election
winners_by_party_urban <- winners_urban |>
  group_by(date, party_recoded) |>
  summarize(num_municipalities = n(), .groups = "drop") |>
  arrange(date, desc(num_municipalities))

# Stack both summaries, tagged with the area type
type_combined <- bind_rows(
  winners_by_party_rural |> mutate(type = "Rural"),
  winners_by_party_urban |> mutate(type = "Urban")
)

# Election date as a factor, one level per date in order of first appearance
type_combined$date <- factor(type_combined$date, levels = unique(type_combined$date))

# Stacked bar chart, one facet per area type (Rural / Urban): for every
# election, the share of municipalities won by each party.
# `position = "fill"` rescales every bar to 1, so the y axis shows shares,
# not counts — the axis label says so.
rural_urban <- ggplot(type_combined, aes(x = date, y = num_municipalities, fill = party_recoded)) +
  geom_col(position = "fill", color = "black") +
  scale_fill_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505",
      "OTHER" = "gray60",
      "PODEMOS" = "#a444b4",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "darkgreen",
      "BLOQUE NACIONALISTA GALEGO" = "lightblue",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "orange",
      "VOX" = "#83b431",
      "CONVERGENCIA I UNIO" = "#1b348a",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "yellow"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
      "OTHER" = "Others",
      "PODEMOS" = "Podemos",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
      "BLOQUE NACIONALISTA GALEGO" = "BNG",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "C's",
      "CONVERGENCIA I UNIO" = "CiU",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "ERC"
    )
  ) +
  labs(
    title = "Wins depending on the type of the area",
    x = "Election Date",
    y = "Share of Municipalities",
    fill = "Parties"
  ) +
  facet_wrap(~ type, scales = "free_y") +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 14, face = "bold", family = "Roboto"),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5, family = "Roboto", margin = margin(b = 20)),
    axis.text.x = element_text(size = 11, family = "Roboto", color = "black", angle = 20),
    axis.text.y = element_text(size = 11, family = "Roboto", color = "black"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` for border widths
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

Question 5

How to calibrate the error of the polls (remember that the polls are voting intentions at national level)?

# Total ballots cast per election: each municipality's `total_votes` is
# repeated on every party row, so it is de-duplicated per election and
# municipality before being added up.
elections_aggregated2 <- election_data_tidy |>
  distinct(date, cod_mun, total_votes) |>
  group_by(date) |>
  summarise(participation_total = sum(total_votes), .groups = "drop")

# Votes obtained by each party (and abbreviation) in each election
elections_aggregated1 <- election_data_tidy |>
  group_by(date, party_recoded, siglas) |>
  summarise(total_votes_all = sum(votes, na.rm = TRUE), .groups = "drop")

# Vote share of each party, as a percentage of the ballots cast in that
# election
elections_with_shares <- left_join(
  elections_aggregated1,
  elections_aggregated2,
  by = "date"
) |>
  mutate(vote_share = 100 * (total_votes_all / participation_total))

# Attach the actual result to every poll estimate, matching on the month of
# the election and on the party abbreviation, then compute the signed error
# (poll estimate minus actual vote share).
poll_calibration <- surveys_tidy |>
  mutate(year_month_elec = floor_date(date_elec, "month")) |>
  left_join(
    mutate(elections_with_shares, year_month = floor_date(date, "month")),
    by = c("year_month_elec" = "year_month", "party" = "siglas")
  ) |>
  mutate(error = estimation - vote_share)

Question 6

Which polling houses got it right the most and which ones deviated the most from the results?

# Spread of the signed error for each pollster
error_analysis <- poll_calibration |>
  group_by(pollster) |>
  summarise(sd_error = sd(error, na.rm = TRUE))

# Magnitude of the error, regardless of direction
poll_calibration <- mutate(poll_calibration, abs_error = abs(error))

# Mean and standard deviation of the absolute error per pollster, most
# accurate pollster first
pollster_accuracy <- poll_calibration |>
  group_by(pollster) |>
  summarise(
    mean_abs_error = mean(abs_error, na.rm = TRUE),
    sd_abs_error = sd(abs_error, na.rm = TRUE)
  ) |>
  arrange(mean_abs_error)

# Horizontal bar chart of the 10 pollsters with the highest mean absolute error
top_pollsters <- pollster_accuracy |>
  slice_max(mean_abs_error, n = 10)

top_error_pollster <- ggplot(
  top_pollsters,
  aes(x = reorder(pollster, mean_abs_error), y = mean_abs_error)
) +
  geom_col(fill = "purple", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Top 10 Pollster with Highest Mean Absolute Error",
    x = "Pollster",
    y = "Mean Absolute Error"
  ) +
  theme_minimal()

# Horizontal bar chart of the 10 pollsters with the lowest mean absolute error
low_pollsters <- pollster_accuracy |>
  slice_min(mean_abs_error, n = 10)

low_error_pollsters <- ggplot(
  low_pollsters,
  aes(x = reorder(pollster, mean_abs_error), y = mean_abs_error)
) +
  geom_col(fill = "purple", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Top 10 Pollster with Lowest Mean Absolute Error",
    x = "Pollster",
    y = "Mean Absolute Error"
  ) +
  theme_minimal()

Creative 1

How does polling error vary by party?

# Mean and standard deviation of the absolute polling error per party,
# smallest mean error first; parties whose standard deviation is missing are
# left out.
polling_error_by_party <- poll_calibration |>
  group_by(party) |>
  summarise(
    mean_abs_error = mean(abs_error, na.rm = TRUE),
    sd_abs_error = sd(abs_error, na.rm = TRUE)
  ) |>
  filter(!is.na(sd_abs_error)) |>
  arrange(mean_abs_error)

Creative 2

By finding the most successful two parties for each year, calculate a polarisation index. Then, compare polarisation of vote of no confidence elections with the rest

#' Two-party vote concentration ("polarisation index") per election
#'
#' For every election date in `data`, adds up the votes of the two parties
#' with the most votes and divides that sum by the total number of ballots
#' cast (blank + null + candidature votes, counted once per municipality).
#'
#' @param data Long-format election data with the columns `date`, `cod_mun`,
#'   `party_recoded`, `votes`, `votos_blancos`, `votos_nulos` and
#'   `votos_candidaturas`.
#' @param year A single value: 2008, 2011, 2015, 2016, 2019, or `"all"` to
#'   keep every election.
#' @return A data frame with one row per election date and the columns
#'   `date`, `top_parties_votes`, `participation_total` and
#'   `polarization_index`. If `year` is not one of the accepted values (or is
#'   not a single value), a warning is raised and `NULL` is returned.
polarization_calc <- function(data, year) {
  valid_years <- c(2008, 2011, 2015, 2016, 2019, "all")

  # `year` must be exactly one accepted value; anything else (including a
  # vector or NULL, which would otherwise make `if` fail) is rejected with
  # the same warning
  if (length(year) != 1L || is.na(year) || !year %in% valid_years) {
    warning("Hey you! The year has to be one of these values: 2008, 2011, 2015, 2016, 2019, or 'all' (in quotes!) if you want to see the information for all years. Thanks :). Output:")
    return(NULL)
  }

  # Copy under a different name so that the argument cannot be confused with
  # the `year()` function used on the date column below
  target_year <- year

  elections_processed <- data |>
    mutate(
      votos_candidaturas_complete =
        votos_blancos + votos_nulos + votos_candidaturas
    )

  # Ballots cast per election: the municipality total is repeated on every
  # party row, so de-duplicate it per election and municipality first
  elections_aggregated_total <- elections_processed |>
    distinct(date, cod_mun, votos_candidaturas_complete) |>
    group_by(date) |>
    summarise(
      participation_total = sum(votos_candidaturas_complete),
      .groups = "drop"
    )

  elections_aggregated_parties <- elections_processed |>
    group_by(date, party_recoded) |>
    summarise(total_votes_all = sum(votes, na.rm = TRUE), .groups = "drop")

  # Combined votes of the two most-voted parties per election;
  # `with_ties = FALSE` guarantees exactly two parties are added up
  elections_top_parties <- elections_aggregated_parties |>
    group_by(date) |>
    slice_max(total_votes_all, n = 2, with_ties = FALSE) |>
    summarise(top_parties_votes = sum(total_votes_all))

  polarization_index <- elections_top_parties |>
    left_join(elections_aggregated_total, by = "date") |>
    mutate(polarization_index = top_parties_votes / participation_total)

  if (target_year != "all") {
    polarization_index <- polarization_index |>
      filter(lubridate::year(date) == target_year)
  }

  polarization_index
}

Creative 3

How has the vote in Catalonia changed over the years studied?

# Parties that the general recoding lumps into other categories but that are
# tracked separately (under the "CIU" abbreviation) for Catalonia
new_parties <- c(
  "JUNTS PER CATALUNYA-JUNTS",
  "CONVERGÈNCIA DEMOCRÀTICA DE CATALUNYA"
)

# Rows of the community with code "09", with the two parties above restored
# under their own name, the province name attached, and one row kept per
# election, municipality and recoded party.
catalunya <- election_data_tidy |>
  filter(codigo_ccaa == "09") |>
  mutate(
    party_recoded = if_else(party %in% new_parties, party, party_recoded),
    # Both restored parties are counted under the CiU abbreviation
    siglas = if_else(party_recoded %in% new_parties, "CIU", siglas),
    province = case_when(
      codigo_provincia == "08" ~ "Barcelona",
      codigo_provincia == "17" ~ "Girona",
      codigo_provincia == "25" ~ "Lleida",
      codigo_provincia == "43" ~ "Tarragona",
      TRUE ~ "Unknown"
    )
  ) |>
  distinct(date, cod_mun, party_recoded, .keep_all = TRUE)

# Votes per election, province and party abbreviation (excluding "OTHER"),
# plus each party's percentage of the votes counted in that election and
# province
catalunya_votes <- catalunya |>
  filter(party_recoded != "OTHER") |>
  group_by(date, province, siglas) |>
  summarise(total_votes = sum(votes, na.rm = TRUE), .groups = "drop") |>
  group_by(date, province) |>
  mutate(vote_percentage_parties = 100 * (total_votes / sum(total_votes))) |>
  ungroup()

# Vote share of each party over time, one panel per Catalan province
gcat <- ggplot(
  catalunya_votes,
  aes(x = date, y = vote_percentage_parties, colour = siglas, group = siglas)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  facet_wrap(vars(province), ncol = 2) +
  labs(
    title = "General elections in Catalonia over the years",
    x = "Date of the election",
    y = "Vote Share (%)",
    color = "Party",
    caption = "Source: Electoral Data"
  ) +
  # Values are already percentages, so they are only given a "%" suffix
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  scale_color_manual(
    values = c(
      "PP" = "#1db4e8",
      "PSOE" = "#c30505",
      "PODEMOS" = "#a444b4",
      "VOX" = "#83b431",
      "ERC" = "#ffbf41",
      "CIU" = "#1b348a",
      "MP" = "#004938",
      "CS" = "#eb6109"
    )
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 16, face = "bold", hjust = 0.5, margin = margin(b = 20)
    ),
    strip.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, face = "bold"),
    axis.text.x = element_text(size = 8, color = "black", angle = 45, hjust = 1),
    axis.text.y = element_text(size = 8, color = "black"),
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 8),
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    legend.key.size = unit(1, "lines"),
    plot.margin = margin(15, 15, 15, 15)
  )

What happens if we group the parties into pro-independence and non-independence parties?

# Abbreviations counted as the pro-independence bloc (ERC and CIU only)
independentist_parties <- c("ERC", "CIU")

# Tag every party row with its bloc
dataset <- mutate(
  catalunya_votes,
  independentist = ifelse(
    siglas %in% independentist_parties,
    "Independentist",
    "Non-Independentist"
  )
)

# Votes of each bloc per election and province, and the bloc's percentage of
# the votes counted in that election and province (result stays grouped by
# date and province)
votos_porcentaje_prov <- dataset |>
  group_by(date, independentist, province) |>
  summarise(total_votes = sum(total_votes), .groups = "drop") |>
  group_by(date, province) |>
  mutate(percentage = 100 * (total_votes / sum(total_votes)))

# Vote percentage of the pro-independence and non-independence blocs over
# time, one panel per province
gcat2 <- ggplot(votos_porcentaje_prov, aes(x = date, y = percentage, color = independentist, group = independentist)) +
  # `linewidth` replaces the deprecated `size` for line widths, matching the
  # usage elsewhere in this file
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_color_manual(
    values = c("Independentist" = "darkblue", "Non-Independentist" = "darkred"),
    labels = c("Independentist" = "Pro-Independence", "Non-Independentist" = "Non-Independence")
  ) +
  labs(
    title = "Evolution of Pro-Independence vs Non-Independence Vote",
    x = "Election Year",
    y = "Percentage of Votes (%)",
    color = "Political Bloc:",
    caption = "Source: Electoral Data"
  ) +
  facet_wrap(~ province, ncol = 2) +
  # Values are already percentages, so they are only given a "%" suffix
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 10, face = "bold"),
    axis.text.x = element_text(size = 8, color = "black", angle = 45, hjust = 1),
    axis.text.y = element_text(size = 8, color = "black"),
    legend.title = element_text(size = 10, face = "bold"),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    legend.position = "top",
    strip.text = element_text(size = 12, face = "bold"),
    plot.margin = margin(10, 10, 10, 10)
  )

Creative 4: Chaos

# Municipalities selected for the comparison
selected_municipios <- c(
  "Alpedrete", "Moaña",
  "Cornellà de Llobregat",
  "Manacor", "Madrid", "Bilbao",
  "Vitoria-Gasteiz", "Gijón"
)
race <- filter(election_data_tidy, municipio %in% selected_municipios)
rm(selected_municipios)

# Winner (row with the most votes) per municipality and election
winners <- race |>
  group_by(municipio, date) |>
  slice_max(votes, n = 1, with_ties = FALSE) |>
  ungroup()

# For every election, compare the winner with the winner of the previous
# election in the same municipality: 1 = changed, 0 = unchanged, NA for the
# first election (nothing to compare with)
stability <- winners |>
  arrange(municipio, date) |>
  group_by(municipio) |>
  mutate(
    previous_party = lag(party_recoded),
    party_change = as.numeric(party_recoded != previous_party)
  ) |>
  ungroup()

# Per municipality: number of changes, number of election-to-election
# comparisons (elections minus one) and the proportion of comparisons in
# which the winner changed
chaos_summary <- stability |>
  group_by(municipio) |>
  summarise(
    total_changes = sum(party_change, na.rm = TRUE),
    total_comparisons = n() - 1,
    chaos_rate = total_changes / total_comparisons
  )

# Horizontal bar chart of the chaos rate per municipality, ordered by rate.
# The municipality name is printed inside each bar (the axis labels are
# hidden) and the rate, as a percentage, just past its end.
chaos_withlabels <-
  ggplot(chaos_summary,
         aes(x = reorder(municipio, chaos_rate),
             y = chaos_rate,
             fill = chaos_rate)) +
  geom_col(width = 0.7, alpha = 0.9) +
  scale_fill_viridis(option = "plasma", direction = -1, name = "Chaos Rate") +
  geom_text(aes(label = municipio),
            hjust = 1.1, color = "white", size = 5, fontface = "bold") +
  geom_text(aes(label = scales::percent(chaos_rate, accuracy = 1)),
            hjust = -0.1, size = 4, fontface = "bold") +
  labs(
    title = "Chaos Rate of Municipalities in Elections",
    subtitle = "Proportion of elections with a change in the winning party",
    x = "Municipality",
    y = "Chaos Rate (%)"
  ) +
  coord_flip() +
  # Axis ticks every fifth, labelled as fractions.
  # NOTE(review): the upper limit is 0.7, so a municipality with a chaos rate
  # above 0.7 would fall outside the scale and not be drawn — confirm this is
  # intended for the selected municipalities.
  scale_y_continuous(
    limits = c(0, 0.7),
    breaks = seq(0.2, 1, by = 0.2),
    labels = c("1/5", "2/5", "3/5", "4/5", "5/5")
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 11, family = "Roboto", color = "black"),
    plot.title = element_text(face = "bold", size = 16, family = "Roboto"),
    plot.subtitle = element_text(size = 12, family = "Roboto"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` for border widths
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    panel.grid.major.y = element_blank(),
    legend.key.size = unit(1, "lines")
  )

But… who did they vote for?

Bibliography

Gómez Valenzuela, Víctor, and Adelheid Holl. 2023. “Growth and Decline in Rural Spain: An Exploratory Analysis.” European Planning Studies 32 (2): 430–53. https://doi.org/10.1080/09654313.2023.2179390.